# Spotify Music Recommender using KNN Model
import os
import numpy as np
import random
import pandas as pd
import collections
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn import set_config
import seaborn as sns
import matplotlib.cm as cm
sns.set()
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
# Download the Spotify playlist dataset (artists, audio features, playlists,
# tracks) plus the per-table column-description CSVs from the RecoHut repo.
# NOTE: '!' lines are IPython shell magics — this file is a notebook export
# and only runs as-is inside IPython/Jupyter.
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/20210824_212829_artists.tsv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/20210824_212829_audios.tsv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/20210824_212829_playlists.tsv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/20210824_212829_tracks.tsv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/des_artist.csv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/des_audio.csv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/des_playlist.csv
!wget -q --show-progress https://github.com/RecoHut-Datasets/spotify/raw/v1/des_tracks.csv
# Load the four TSV tables into dataframes.
pd_artist = pd.read_csv('20210824_212829_artists.tsv', sep='\t')
pd_audio = pd.read_csv('20210824_212829_audios.tsv', sep='\t')
pd_playlist = pd.read_csv('20210824_212829_playlists.tsv', sep='\t')
pd_track = pd.read_csv('20210824_212829_tracks.tsv', sep='\t')
# Exploratory inspection of each table and its description CSV.  The bare
# expressions are notebook-style display cells; as a plain script they are
# evaluated and discarded with no effect.
pd_track.columns.to_list()
pd_track
pd_track.info()
pd.read_csv('des_tracks.csv')
pd_audio.columns.to_list()
pd_audio
pd_audio.info()
pd.read_csv('des_audio.csv')
pd_playlist.columns.to_list()
pd_playlist
pd_playlist.info()
pd.read_csv('des_playlist.csv')
pd_artist.columns.to_list()
pd_artist
pd_artist.info()
pd.read_csv('des_artist.csv')
# Pairwise scatter matrix over the audio features (KDE on the diagonal).
pd.plotting.scatter_matrix(pd_audio, diagonal="kde", figsize=(20, 20))
plt.show()
# Join tracks with their audio features (tracks.track_id == audios.id), then
# with playlist metadata, and drop columns/rows not needed for modeling.
pd_track = pd.merge(pd_track, pd_audio, left_on = 'track_id', right_on = 'id')
pd_track
pd_full = pd.merge(pd_track, pd_playlist, left_on = 'playlist_id', right_on = 'playlist_id')
pd_full = pd_full.drop('playlist_name', axis = 1)
pd_full = pd_full.drop('playlist_description', axis = 1)
pd_full = pd_full.drop('id', axis = 1)
pd_full = pd_full.drop_duplicates()
# Replace NaNs with the string 'None' so later string-based filters don't choke.
pd_full = pd_full.fillna('None')
pd_full.info()
# Correlation heatmap over the numeric track/audio columns.
names = list(pd_track.select_dtypes(np.number).columns)
data = pd_track[names]
corr = data.corr()
# plot correlation matrix
plt.figure(figsize=(15, 15))
ax = sns.heatmap(
corr, annot=True,
vmin=-1, vmax=1, center=0,
cmap=sns.diverging_palette(20, 220, n=200),
square=True
)
# Slant the x labels so long feature names stay readable.
ax.set_xticklabels(
ax.get_xticklabels(),
rotation=45,
horizontalalignment='right'
);
def generate_playlist(pd_track, pd_playlist, num_playlist_to_test=100, threshold=30):
    """Sample playlists with at least `threshold` tracks and collect their track ids.

    :param pd_track: dataframe with 'playlist_id' and 'track_id' columns.
    :param pd_playlist: dataframe with 'playlist_id' and 'playlist_num_tracks'.
    :param num_playlist_to_test: number of playlists to sample (capped at the
        number of eligible playlists).
    :param threshold: minimum track count a playlist needs to be eligible.
    :return: dict mapping playlist_id -> list of that playlist's track ids.
    """
    playlist_sampled = pd_playlist[pd_playlist["playlist_num_tracks"] >= threshold]
    # Cap the sample size: Series.sample raises ValueError when asked for
    # more rows than exist, which the original code did not guard against.
    n = min(num_playlist_to_test, len(playlist_sampled))
    playlist_selected = playlist_sampled["playlist_id"].sample(n=n, random_state=0)
    track_ids = {}
    for list_id in playlist_selected:
        track_ids[list_id] = list(pd_track[pd_track['playlist_id'] == list_id]['track_id'])
    return track_ids
def generate_test_playlist(track_ids, missing_rate=0.2):
    """Truncate each playlist to its first (1 - missing_rate) fraction of tracks.

    The tail of each list is withheld so a recommender can be scored on
    predicting it back.

    :param track_ids: dict mapping playlist_id -> list of track ids.
    :param missing_rate: fraction of tracks (from the end) to withhold.
    :return: dict mapping playlist_id -> truncated list of track ids.
    """
    kept = {}
    for playlist_id, tracks in track_ids.items():
        cutoff = int(len(tracks) * (1 - missing_rate))
        kept[playlist_id] = tracks[:cutoff]
    return kept
def r_precision(prediction, label):
    """
    Calculate r-precision: |prediction ∩ label| / |label| (after deduplication).

    :param prediction: iterable of predicted items; duplicates are ignored.
    :param label: iterable of ground-truth items; duplicates are ignored.
    :return: float score in [0, 1]; 0.0 when `label` is empty (the original
        code printed a warning and then crashed with UnboundLocalError,
        because `score` was never assigned on the exception path).
    """
    prediction = list(set(prediction))
    label = list(set(label))
    if not label:
        print(f"division by zero prediction: {prediction}, label: {label}, len(label): {len(label)}")
        return 0.0
    return len(list(set(prediction) & set(label))) / len(label)
# --- Rebuild the evaluation dataset from the raw files --------------------
# Reload everything so this section is independent of the EDA mutations above.
pd_artist = pd.read_csv('20210824_212829_artists.tsv', sep='\t')
pd_audio = pd.read_csv('20210824_212829_audios.tsv', sep='\t')
pd_playlist = pd.read_csv('20210824_212829_playlists.tsv', sep='\t')
pd_track = pd.read_csv('20210824_212829_tracks.tsv', sep='\t')
scaler = MinMaxScaler()
playlist_test_size = 100
missing_rate = 0.2
pd_playlist = pd_playlist.copy()
pd_track = pd_track.copy()
pd_audio = pd_audio.copy()
# Join tracks with audio features and playlist metadata, then clean up.
pd_track = pd.merge(pd_track, pd_audio, left_on='track_id', right_on='id')
pd_full = pd.merge(pd_track, pd_playlist, left_on='playlist_id', right_on='playlist_id')
pd_full = pd_full.drop('playlist_name', axis=1)
pd_full = pd_full.drop('playlist_description', axis=1)
pd_full = pd_full.drop('id', axis=1)
pd_full = pd_full.drop_duplicates()
pd_full = pd_full.fillna('None')
# Sample test playlists and withhold `missing_rate` of each one's tracks.
playlist_test = generate_playlist(pd_track, pd_playlist)
playlist_test_all_track = generate_playlist(pd_full, pd_playlist, num_playlist_to_test=playlist_test_size)
playlist_to_test = generate_test_playlist(playlist_test_all_track, missing_rate)
playlist_all_id_to_test = list(playlist_to_test.keys())
# Restrict the full table to the sampled playlists.
pd_full_test = pd_full.copy()
pd_full_test = pd_full_test[pd_full_test['playlist_id'].isin(playlist_all_id_to_test)]
pd_full_test = pd_full_test.reset_index(drop=True)
pd_full_test_copy = pd_full_test.copy()
# Identifier/metadata columns that must not reach the scaler.
columns_to_drop = ['playlist_id', 'track_id', 'track_name', 'artist_ids',
                   'artist_names', 'album_id', 'album_name',
                   'playlist_num_tracks', 'playlist_num_followers']
pd_full_test_copy = pd_full_test_copy.drop(columns_to_drop, axis=1)
# Sanity check: every sampled playlist's tracks appear exactly once.
# (renamed from `sum`, which shadowed the builtin)
total_tracks = 0
for key in playlist_test_all_track.keys():
    total_tracks += len(playlist_test_all_track[key])
assert total_tracks == pd_full_test_copy.shape[0]
scaler.fit(pd_full_test_copy)
# BUG FIX: this was originally assigned to `scaledpd_full_test`, but every
# later cell reads `scaled_pd_full_test`, raising NameError at runtime.
scaled_pd_full_test = scaler.transform(pd_full_test_copy)
class ColDropper(BaseEstimator, TransformerMixin):
    """Pipeline transformer that strips identifier/metadata columns,
    leaving only the numeric audio-feature columns for scaling."""

    def __init__(self):
        # Columns to remove before any numeric processing.
        self.columns_to_drop = ['playlist_id', 'track_id',
                                'track_name', 'artist_ids',
                                'artist_names', 'album_id',
                                'album_name', 'playlist_num_tracks',
                                'playlist_num_followers']
        self.result_ = None

    def fit(self, x, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, x, y=None):
        """Return a copy of `x` without the metadata columns."""
        trimmed = x.copy()
        return trimmed.drop(columns=self.columns_to_drop)
# k-distance plot (sorted neighbor distances) used to eyeball a reasonable
# DBSCAN eps before clustering.
plt.figure(figsize=(10, 5))
nn = NearestNeighbors(n_neighbors=5).fit(scaled_pd_full_test)
distances, idx = nn.kneighbors(scaled_pd_full_test)
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
plt.plot(distances)
plt.show()
# Cluster the scaled features with DBSCAN at the eps chosen from the plot.
dbscan = DBSCAN(eps=0.38, algorithm='ball_tree')
dbscan.fit(scaled_pd_full_test)
label = dbscan.labels_
pd_full_test_copy['cluster_label'] = label
# End-to-end pipeline: drop metadata -> min-max scale -> DBSCAN.
full_pipeline = Pipeline(
[('ColDropper', ColDropper()), ('Scaler', MinMaxScaler()), ('DBSCAN', DBSCAN(eps=0.38, algorithm='ball_tree'))])
full_pipeline
print(full_pipeline.get_params().keys())
# Inspect the first test playlist: all of its songs vs. the seed subset.
playlist_id_to_test = playlist_all_id_to_test[0]
playlist_id_to_test
all_name_songs = list(
pd_full[pd_full['track_id'].isin(playlist_test_all_track[playlist_id_to_test])]['track_name'].values)
print('All song in playlist 0:', all_name_songs)
given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[playlist_id_to_test])]['track_name'].values)
print('Given in playlist 0 : ', given_name_songs)
def recommend_songs(given_playlist_track, pd_full_test, info_df, n_pred, eps=0.38, random_state=42, algo='ball_tree'):
    """Recommend up to `n_pred` track names from the DBSCAN cluster that best
    matches the playlist owning the given seed tracks.

    Relies on the module-level `full_pipeline` (ColDropper -> MinMaxScaler ->
    DBSCAN) defined just above.

    :param given_playlist_track: seed track ids already in the playlist.
    :param pd_full_test: dataframe the pipeline is fitted on.
    :param info_df: metadata dataframe used for lookups.
        NOTE(review): cluster labels are attached positionally, so `info_df`
        must align row-for-row with `pd_full_test` — confirm at call sites.
    :param n_pred: maximum number of track names to return.
    :param eps: DBSCAN neighborhood radius.
    :param random_state: unused; kept for signature compatibility.
    :param algo: DBSCAN neighbor-search algorithm.
    :return: list of recommended track names.
    """
    # Playlist id that owns the most seed tracks.
    pid = list(info_df[info_df['track_id'].isin(given_playlist_track)]['playlist_id'].value_counts().index)[0]
    full_pipeline.set_params(DBSCAN__eps=eps, DBSCAN__algorithm=algo)
    full_pipeline.fit(X=pd_full_test)
    label = full_pipeline['DBSCAN'].labels_
    info_df_copy = info_df.copy()
    info_df_copy['cluster_label'] = label
    # Cluster most represented among the seed playlist's tracks.
    most_cluster = list(info_df_copy[info_df['playlist_id'] == pid]['cluster_label'].value_counts().index)[0]
    # Candidates: same cluster, excluding the seeds themselves.
    same_cluster_track_df = info_df[
        ~(info_df['track_id'].isin(given_playlist_track)) & (info_df_copy['cluster_label'] == most_cluster)]
    same_cluster_track_df = same_cluster_track_df.sort_values(by='popularity', ascending=False)
    result = []
    # np.unique yields each name once (count 1), so most_common(n_pred)
    # effectively takes the first n_pred names of np.unique's sorted output.
    recs_names = list(np.unique(same_cluster_track_df['track_name'].values))
    preds = collections.Counter(recs_names).most_common(n_pred)
    for pred in preds:
        result.append(pred[0])
    return result
# Ask the DBSCAN recommender to fill in playlist 0's withheld tracks
# (n_pred = number of tracks that were held out).
recommended_songs = recommend_songs(given_playlist_track=playlist_to_test[playlist_id_to_test],
                                    pd_full_test=pd_full_test,
                                    info_df=pd_full_test,
                                    n_pred=len(playlist_test_all_track[playlist_id_to_test]) - len(
                                        playlist_to_test[playlist_id_to_test]))
print(recommended_songs)
def test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100):
    """Evaluate the current `recommend_songs` over a sample of playlists.

    Samples `nums_playlists_test` playlists, withholds `missing_rate` of each
    one's tracks, asks `recommend_songs` to predict the gap, and scores the
    predictions with `r_precision` against the withheld song names.

    :param pd_playlist: playlist metadata dataframe.
    :param pd_full: merged track/audio/playlist dataframe.
    :param missing_rate: fraction of each playlist's tracks to withhold.
    :param nums_playlists_test: number of playlists to sample.
    :return: list of per-playlist R-precision scores.
    """
    r_score = []
    playlist_test_all_track = generate_playlist(pd_full, pd_playlist, num_playlist_to_test=nums_playlists_test)
    playlist_to_test = generate_test_playlist(playlist_test_all_track, missing_rate=missing_rate)
    all_playlistID_for_tests = list(playlist_to_test.keys())
    # Restrict the candidate pool to the sampled playlists.
    pd_full_tests = pd_full.copy()
    pd_full_tests = pd_full_tests[pd_full_tests['playlist_id'].isin(all_playlistID_for_tests)]
    pd_full_tests = pd_full_tests.reset_index(drop=True)
    pd_full_tests_copy = pd_full_tests.copy()
    for each_playlist in all_playlistID_for_tests:
        # Number of withheld tracks = how many the recommender must produce.
        n_missing = len(playlist_test_all_track[each_playlist]) - len(playlist_to_test[each_playlist])
        recommended_songs = recommend_songs(playlist_to_test[each_playlist],
                                            pd_full_tests_copy,
                                            pd_full_tests,
                                            n_missing)
        # (removed `recommended_songs_all_playlist`, which was accumulated
        # but never read or returned)
        all_name_songs = list(
            pd_full[pd_full['track_id'].isin(playlist_test_all_track[each_playlist])]['track_name'].values)
        given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[each_playlist])]['track_name'].values)
        songs_need_to_recommend = [song for song in all_name_songs if song not in given_name_songs]
        r_score.append(r_precision(recommended_songs, songs_need_to_recommend))
    return r_score
# Baseline evaluation at the default 20% missing rate.
r_score = test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100)
# Sweep the missing rate from 10% to 90% and report R-Precision statistics.
missing_rates = np.arange(0.1, 1, 0.1)
for missing_rate in missing_rates:
    r_score = test_multi(pd_playlist, pd_full, missing_rate=missing_rate, nums_playlists_test=100)
    print('*' * 50)
    print('Missing rate = ', missing_rate)  # fixed typo: 'Mising' -> 'Missing'
    print('Average R-Precision', np.array(r_score).mean())
    print('Max R-Precision', np.array(r_score).max())
    print('*' * 50)
def visualized_cluster_result(dataframe, model_label, suptitle="TSNE2D vs. PCA2D", path_to_save="../images/fig.png"):
    """Project `dataframe` to 2D with t-SNE and with PCA, scatter both
    projections colored by `model_label`, save the combined figure to
    `path_to_save`, and return the two projected frames.

    :param dataframe: feature matrix (copied before each projection).
    :param model_label: per-row cluster labels used as the hue.
    :param suptitle: overall figure title.
    :param path_to_save: output path for the combined figure.
    :return: (dftsne, dfpca2), each with columns ['x1', 'x2', 'cluster'].
    """
    sns.set()

    def _to_frame(points_2d):
        # Wrap a 2D embedding as a dataframe with labeled cluster column.
        frame = pd.DataFrame(points_2d)
        frame['cluster'] = model_label
        frame.columns = ['x1', 'x2', 'cluster']
        return frame

    dftsne = _to_frame(TSNE(n_components=2, random_state=42).fit_transform(dataframe.copy()))
    dfpca2 = _to_frame(PCA(n_components=2, random_state=42).fit_transform(dataframe.copy()))
    fig, ax = plt.subplots(1, 2, figsize=(18, 9))
    panels = ((ax[0], dftsne, 'Visualized on TSNE 2D'),
              (ax[1], dfpca2, 'Visualized on PCA 2D'))
    for axis, frame, title in panels:
        sns.scatterplot(data=frame, x='x1', y='x2', hue='cluster', legend="full",
                        alpha=0.5, ax=axis, palette=plt.get_cmap('tab20'))
        axis.set_title(title)
    fig.suptitle(f'{suptitle}')
    fig.savefig(path_to_save)
    return dftsne, dfpca2
# DBSCAN cluster visualization: combined figure plus standalone t-SNE / PCA plots.
db_scan_dftsne, db_scan_dfpca2 = visualized_cluster_result(scaled_pd_full_test,label,"Visualized with 7000 tracks","dbscan_7000tracks_13cluster.png")
sns.set();
sns.set(rc={'figure.figsize':(15,15)})
sns.scatterplot(data=db_scan_dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette=plt.get_cmap('tab20'))
plt.savefig("db_scan_dftsne.png")
sns.set();
sns.set(rc={'figure.figsize':(15,15)})
sns.color_palette("Spectral", as_cmap=True)
sns.scatterplot(data=db_scan_dfpca2,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette=plt.get_cmap('tab20'))
plt.savefig("db_scan_dfpca2.png")
# Ground truth: recolor the same t-SNE embedding by playlist membership
# (playlist ids label-encoded to integers) for visual comparison.
sns.set();
sns.set(rc={'figure.figsize':(15,15)})
gt_dftsne = db_scan_dftsne.copy()
gt_label = LabelEncoder().fit_transform(pd_full_test["playlist_id"])
gt_dftsne['cluster'] = gt_label
sns.scatterplot(data=gt_dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5, palette=plt.get_cmap('tab20'))
plt.legend([],[], frameon=False)
plt.savefig("ground_truth.png")
# Exploratory 10-nearest-neighbor distances on the scaled features.
nn = NearestNeighbors(n_neighbors=10).fit(scaled_pd_full_test)
distances, idx = nn.kneighbors(scaled_pd_full_test)
distances
# KNN pipeline: drop metadata -> min-max scale -> NearestNeighbors.
full_pipeline = Pipeline([('ColDropper', ColDropper()), ('Scaler', MinMaxScaler()), ('KNN', NearestNeighbors())])
print(full_pipeline.get_params().keys())
# Inspect the first test playlist again for the KNN recommender.
playlist_id_to_test = playlist_all_id_to_test[0]
playlist_id_to_test
all_name_songs = list(
pd_full[pd_full['track_id'].isin(playlist_test_all_track[playlist_id_to_test])]['track_name'].values)
print('All song in playlist 0:', all_name_songs)
given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[playlist_id_to_test])]['track_name'].values)
print('Given in playlist 0 : ', given_name_songs)
def recommend_songs(given_playlist_track, pd_full_test, info_df, n_pred):
    """Recommend up to `n_pred` track names via KNN over scaled audio features.

    Relies on the module-level `full_pipeline` (ColDropper -> MinMaxScaler ->
    NearestNeighbors).

    :param given_playlist_track: seed track ids already in the playlist.
    :param pd_full_test: dataframe the pipeline is fitted on (candidate pool).
    :param info_df: metadata dataframe used to map neighbor row indices back
        to track ids/names.
        NOTE(review): assumed to align row-for-row with the fitted pool —
        confirm at call sites.
    :param n_pred: maximum number of track names to return.
    :return: list of recommended track names (no seeds, no duplicates); may
        be shorter than `n_pred` when the candidate pool runs out.
    """
    full_pipeline.set_params(KNN__n_neighbors=n_pred)
    full_pipeline.fit(X=pd_full_test)
    # BUG FIX: project the seed tracks with the transformers already fitted
    # on the full pool.  The original called fit_transform on the seed
    # subset, re-fitting the scaler on a different range than the corpus the
    # KNN index was built from.
    pd_given_feature = info_df[info_df['track_id'].isin(given_playlist_track)]
    pd_given_feature = full_pipeline['ColDropper'].transform(pd_given_feature)
    pd_given_feature = full_pipeline['Scaler'].transform(pd_given_feature)
    _, idx = full_pipeline['KNN'].kneighbors(pd_given_feature)
    # Drop each query's nearest hit (the seed itself) and rank candidate row
    # indices by how many seeds they neighbor.
    idx = idx[:, 1:].flatten()
    ranked = collections.Counter(idx).most_common()
    result = []
    track_duplicates = []
    i = 0
    # BUG FIX: bound the scan by len(ranked) — the original `while k != n_pred`
    # raised IndexError when fewer than n_pred new tracks were available.
    while len(result) < n_pred and i < len(ranked):
        row = ranked[i][0]
        track_id = info_df.loc[row, 'track_id']
        if (track_id not in given_playlist_track) and (track_id not in track_duplicates):
            track_duplicates.append(track_id)
            result.append(info_df.loc[row, 'track_name'])
        i += 1
    return result
# KNN recommendation for playlist 0 (predict its withheld tracks).
recommended_songs = recommend_songs(given_playlist_track=playlist_to_test[playlist_id_to_test],
                                    pd_full_test=pd_full_test,
                                    info_df=pd_full_test,
                                    n_pred=len(playlist_test_all_track[playlist_id_to_test]) - len(
                                        playlist_to_test[playlist_id_to_test]))
# Ad-hoc sanity check with three hand-picked seed track ids.
recommended_songs = recommend_songs(
    given_playlist_track=['5HCyWlXZPP0y6Gqq8TgA20', '4XvcHTUfIlWfyJTRG0aqlo', '27NovPIUIRrOZoCHxABJwK'],
    pd_full_test=pd_full_test,
    info_df=pd_full_test,
    n_pred=len(playlist_test_all_track[playlist_id_to_test]) - len(playlist_to_test[playlist_id_to_test]))
print(recommended_songs)
def test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100):
    """Evaluate the current `recommend_songs` over a sample of playlists.

    Samples `nums_playlists_test` playlists, withholds `missing_rate` of each
    one's tracks, asks `recommend_songs` to predict the gap, and scores the
    predictions with `r_precision` against the withheld song names.

    :param pd_playlist: playlist metadata dataframe.
    :param pd_full: merged track/audio/playlist dataframe.
    :param missing_rate: fraction of each playlist's tracks to withhold.
    :param nums_playlists_test: number of playlists to sample.
    :return: list of per-playlist R-precision scores.
    """
    r_score = []
    playlist_test_all_track = generate_playlist(pd_full, pd_playlist, num_playlist_to_test=nums_playlists_test)
    playlist_to_test = generate_test_playlist(playlist_test_all_track, missing_rate=missing_rate)
    all_playlistID_for_tests = list(playlist_to_test.keys())
    # Restrict the candidate pool to the sampled playlists.
    pd_full_tests = pd_full.copy()
    pd_full_tests = pd_full_tests[pd_full_tests['playlist_id'].isin(all_playlistID_for_tests)]
    pd_full_tests = pd_full_tests.reset_index(drop=True)
    pd_full_tests_copy = pd_full_tests.copy()
    for each_playlist in all_playlistID_for_tests:
        # Number of withheld tracks = how many the recommender must produce.
        n_missing = len(playlist_test_all_track[each_playlist]) - len(playlist_to_test[each_playlist])
        recommended_songs = recommend_songs(playlist_to_test[each_playlist],
                                            pd_full_tests_copy,
                                            pd_full_tests,
                                            n_missing)
        # (removed `recommended_songs_all_playlist`, which was accumulated
        # but never read or returned)
        all_name_songs = list(
            pd_full[pd_full['track_id'].isin(playlist_test_all_track[each_playlist])]['track_name'].values)
        given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[each_playlist])]['track_name'].values)
        songs_need_to_recommend = [song for song in all_name_songs if song not in given_name_songs]
        r_score.append(r_precision(recommended_songs, songs_need_to_recommend))
    return r_score
# Baseline evaluation at the default 20% missing rate.
r_score = test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100)
# Sweep the missing rate from 10% to 90% and report R-Precision statistics.
missing_rates = np.arange(0.1, 1, 0.1)
for missing_rate in missing_rates:
    r_score = test_multi(pd_playlist, pd_full, missing_rate=missing_rate, nums_playlists_test=100)
    print('*' * 50)
    print('Missing rate = ', missing_rate)  # fixed typo: 'Mising' -> 'Missing'
    print('Average R-Precision', np.array(r_score).mean())
    print('Max R-Precision', np.array(r_score).max())
    print('*' * 50)
# --- K-Means: elbow method to pick the number of clusters -----------------
Sum_of_squared_distances = []
K = range(3, 40)
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans = kmeans.fit(scaled_pd_full_test)
    # inertia_ = within-cluster sum of squared distances.
    Sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Fit the final model at the elbow (k=13) and attach cluster labels.
kmeans = KMeans(n_clusters=13)
kmeans.fit(scaled_pd_full_test)
# (removed a stray `KMeans(n_clusters=13)` expression that instantiated and
# immediately discarded an unused estimator — a leftover notebook repr cell)
label = kmeans.labels_
pd_full_test_copy['cluster_label'] = label
pd_full_test_copy
# K-Means pipeline: drop metadata -> min-max scale -> KMeans(k=13).
full_pipeline = Pipeline([('ColDropper', ColDropper()), ('Scaler', MinMaxScaler()), ('KMeans', KMeans(n_clusters=13))])
print(full_pipeline.get_params().keys())
# Inspect the first test playlist again for the K-Means recommender.
playlist_id_to_test = playlist_all_id_to_test[0]
playlist_id_to_test
all_name_songs = list(
pd_full[pd_full['track_id'].isin(playlist_test_all_track[playlist_id_to_test])]['track_name'].values)
print('All song in playlist 0:', all_name_songs)
given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[playlist_id_to_test])]['track_name'].values)
print('Given in playlist 0 : ', given_name_songs)
def recommend_songs(given_playlist_track, pd_full_test, info_df, n_pred, num_clusters=13, random_state=42):
    """Recommend up to `n_pred` track names from the K-Means cluster that best
    matches the playlist owning the given seed tracks.

    Relies on the module-level `full_pipeline` (ColDropper -> MinMaxScaler ->
    KMeans) defined just above.

    :param given_playlist_track: seed track ids already in the playlist.
    :param pd_full_test: dataframe the pipeline is fitted on.
    :param info_df: metadata dataframe used for lookups.
        NOTE(review): cluster labels are attached positionally, so `info_df`
        must align row-for-row with `pd_full_test` — confirm at call sites.
    :param n_pred: maximum number of track names to return.
    :param num_clusters: number of K-Means clusters.
    :param random_state: K-Means random seed.
    :return: list of recommended track names.
    """
    # Playlist id that owns the most seed tracks.
    pid = list(info_df[info_df['track_id'].isin(given_playlist_track)]['playlist_id'].value_counts().index)[0]
    # df_features = pd.DataFrame(df_scaled_features.copy())
    full_pipeline.set_params(KMeans__n_clusters=num_clusters, KMeans__random_state=random_state)
    full_pipeline.fit(X=pd_full_test)
    label = full_pipeline['KMeans'].labels_
    info_df_copy = info_df.copy()
    info_df_copy['cluster_label'] = label
    # Cluster most represented among the seed playlist's tracks.
    most_cluster = list(info_df_copy[info_df['playlist_id'] == pid]['cluster_label'].value_counts().index)[0]
    # Candidates: same cluster, excluding the seeds themselves.
    same_cluster_track_df = info_df[
        ~(info_df['track_id'].isin(given_playlist_track)) & (info_df_copy['cluster_label'] == most_cluster)]
    same_cluster_track_df = same_cluster_track_df.sort_values(by='popularity', ascending=False)
    result = []
    # np.unique yields each name once (count 1), so most_common(n_pred)
    # effectively takes the first n_pred names of np.unique's sorted output.
    recs_names = list(np.unique(same_cluster_track_df['track_name'].values))
    preds = collections.Counter(recs_names).most_common(n_pred)
    for pred in preds:
        result.append(pred[0])
    return result
# K-Means recommendation for playlist 0 (predict its withheld tracks).
recommended_songs = recommend_songs(given_playlist_track=playlist_to_test[playlist_id_to_test],
                                    pd_full_test=pd_full_test,
                                    info_df=pd_full_test,
                                    n_pred=len(playlist_test_all_track[playlist_id_to_test]) - len(
                                        playlist_to_test[playlist_id_to_test]))
print(recommended_songs)
def test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100):
    """Evaluate the current `recommend_songs` over a sample of playlists.

    Samples `nums_playlists_test` playlists, withholds `missing_rate` of each
    one's tracks, asks `recommend_songs` to predict the gap, and scores the
    predictions with `r_precision` against the withheld song names.

    :param pd_playlist: playlist metadata dataframe.
    :param pd_full: merged track/audio/playlist dataframe.
    :param missing_rate: fraction of each playlist's tracks to withhold.
    :param nums_playlists_test: number of playlists to sample.
    :return: list of per-playlist R-precision scores.
    """
    r_score = []
    playlist_test_all_track = generate_playlist(pd_full, pd_playlist, num_playlist_to_test=nums_playlists_test)
    playlist_to_test = generate_test_playlist(playlist_test_all_track, missing_rate=missing_rate)
    all_playlistID_for_tests = list(playlist_to_test.keys())
    # Restrict the candidate pool to the sampled playlists.
    pd_full_tests = pd_full.copy()
    pd_full_tests = pd_full_tests[pd_full_tests['playlist_id'].isin(all_playlistID_for_tests)]
    pd_full_tests = pd_full_tests.reset_index(drop=True)
    pd_full_tests_copy = pd_full_tests.copy()
    for each_playlist in all_playlistID_for_tests:
        # Number of withheld tracks = how many the recommender must produce.
        n_missing = len(playlist_test_all_track[each_playlist]) - len(playlist_to_test[each_playlist])
        recommended_songs = recommend_songs(playlist_to_test[each_playlist],
                                            pd_full_tests_copy,
                                            pd_full_tests,
                                            n_missing)
        # (removed `recommended_songs_all_playlist`, which was accumulated
        # but never read or returned)
        all_name_songs = list(
            pd_full[pd_full['track_id'].isin(playlist_test_all_track[each_playlist])]['track_name'].values)
        given_name_songs = list(pd_full[pd_full['track_id'].isin(playlist_to_test[each_playlist])]['track_name'].values)
        songs_need_to_recommend = [song for song in all_name_songs if song not in given_name_songs]
        r_score.append(r_precision(recommended_songs, songs_need_to_recommend))
    return r_score
# Baseline evaluation at the default 20% missing rate.
r_score = test_multi(pd_playlist, pd_full, missing_rate=0.2, nums_playlists_test=100)
# Sweep the missing rate from 10% to 90% and report R-Precision statistics.
missing_rates = np.arange(0.1, 1, 0.1)
for missing_rate in missing_rates:
    r_score = test_multi(pd_playlist, pd_full, missing_rate=missing_rate, nums_playlists_test=100)
    print('*' * 50)
    print('Missing rate = ', missing_rate)  # fixed typo: 'Mising' -> 'Missing'
    print('Average R-Precision', np.array(r_score).mean())
    print('Max R-Precision', np.array(r_score).max())
    print('*' * 50)
def visualized_cluster_result(dataframe, model_label,suptitle="TSNE2D vs. PCA2D",path_to_save="../images/fig.png"):
    """Project `dataframe` to 2D with t-SNE and with PCA, scatter both
    projections colored by `model_label`, save the combined figure to
    `path_to_save`, and return the two projected frames.

    :param dataframe: feature matrix (copied before each projection).
    :param model_label: per-row cluster labels used as the hue.
    :param suptitle: overall figure title.
    :param path_to_save: output path for the combined figure.
    :return: (dftsne, dfpca2), each with columns ['x1', 'x2', 'cluster'].
    """
    sns.set();
    # t-SNE projection.
    X = dataframe.copy()
    TSNE2 = TSNE(n_components=2,random_state=42).fit_transform(X)
    dftsne = pd.DataFrame(TSNE2)
    dftsne['cluster'] = model_label
    dftsne.columns = ['x1','x2','cluster']
    # PCA projection.
    X = dataframe.copy()
    PCA2 = PCA(n_components=2,random_state=42).fit_transform(X)
    dfpca2 = pd.DataFrame(PCA2)
    dfpca2['cluster'] = model_label
    dfpca2.columns = ['x1','x2','cluster']
    # Side-by-side scatter plots of the two embeddings.
    fig, ax = plt.subplots(1, 2, figsize=(18,9))
    sns.scatterplot(data=dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,ax=ax[0],palette=plt.get_cmap('tab20'))
    ax[0].set_title('Visualized on TSNE 2D')
    sns.scatterplot(data=dfpca2,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,ax=ax[1],palette=plt.get_cmap('tab20'))
    ax[1].set_title('Visualized on PCA 2D')
    fig.suptitle(f'{suptitle}');
    fig.savefig(path_to_save)
    return dftsne, dfpca2
# K-Means cluster visualization: combined figure plus standalone t-SNE / PCA plots.
kmeans_dftsne, kmeans_dfpca2 = visualized_cluster_result(scaled_pd_full_test,label,"Visualized with 7000 tracks","kmean_7000tracks_13cluster.png")
sns.set();
sns.set(rc={'figure.figsize':(15,15)})
sns.scatterplot(data=kmeans_dftsne,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette=plt.get_cmap('tab20'))
plt.savefig("kmeans_dftsne.png")
sns.set();
sns.set(rc={'figure.figsize':(15,15)})
sns.scatterplot(data=kmeans_dfpca2,x='x1',y='x2',hue='cluster',legend="full",alpha=0.5,palette=plt.get_cmap('tab20'))
plt.savefig("kmeans_dfpca2.png")
# Record the execution environment (IPython magics; notebook-only).
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
# END